/*
	Estimates the degree of selection bias into the Glassdoor dataset for
	Korean college graduates employed in South Korean
	Output:	Figure C1o
			Selection_SouthKorea.csv
*/

local seed "C:\Users\jsock\Dropbox\Research\GD\International"

local dataPath "`seed'/Data"
local inputPath "`seed'/InputData"
local figurePath "`seed'/Replication/Figures"
local tablePath "`seed'/Replication/Tables"
local estimatePath "`seed'/Replication/Estimates"
local tempPath "`seed'/Replication/TempData"

********************************************************
* Save exchange rates as dta file
********************************************************

clear 
set more off
set matsize 10000

insheet using "`inputPath'/Exchange_rates_2022.csv", comma
 
keep country_glassdoor year ppp_xrat  

save `tempPath'/Exchange_rates.dta, replace 

*--------------------------------
* Thresholds used for sample selection
*--------------------------------

scalar country_premia_thresh = 25

scalar selection_thresh = 25

scalar school_thresh = 25 

local currentCountry = "South Korea"
local saveCountry = "SouthKorea"

********************************************************
* Read in raw data and calculate school averages
********************************************************

clear 
set more off
set matsize 10000

insheet using "`inputPath'/SK_raw_data.csv", comma
 
* Keep college graduates
keep if q1 == 7 

* Keep if has earnings
destring q26, replace force
keep if inrange(q26,1,5)

* Create annualized earnings
destring q26_1_etc , replace force
destring q26_2_etc , replace force
destring q26_3_etc , replace force
destring q26_4_etc , replace force
destring q26_5_etc , replace force

replace q26_1_etc = . if q26_1_etc >= 99999
replace q26_2_etc = . if q26_2_etc >= 99999
replace q26_3_etc = . if q26_3_etc >= 99999
replace q26_4_etc = . if q26_4_etc >= 99999
replace q26_5_etc = . if q26_5_etc >= 99999

replace q26_1_etc = q26_1_etc if q26 == 1 & q26_1_etc != 99999
replace q26_2_etc = q26_2_etc * 12 if q26 == 2 & q26_2_etc != 99999 

generate annual_earnings = .
replace annual_earnings = q26_1_etc if q26 == 1 
replace annual_earnings = q26_2_etc if q26 == 2 

drop if annual_earnings == . 

* Create cohorts (years after graduation)
destring q7_1 , replace force
generate years_from_graduate = .
replace years_from_graduate = 2017 - q7_1
replace years_from_graduate = . if ~inrange(years_from_graduate,0,12)

drop if years_from_graduate == . 

collapse (mean) annual_earnings [aw=wt], by(college years_from_graduate)

destring college, replace

generate year = 2017

replace annual_earnings = annual_earnings * 10000

* Merge in college names

preserve

	clear
		
	insheet using "`inputPath'/SK_college_id.csv", comma
 
	keep label v3
	rename label college
	rename v3 school
	
	keep if school != ""
	
	tempfile in_results
	save `in_results', emptyok
	
restore

merge m:1 college using `in_results'
keep if _merge == 3
drop _merge

* Read in exchange rates
generate country_glassdoor = "`currentCountry'"

merge m:1 country_glassdoor year using `tempPath'/Exchange_rates.dta

keep if _merge == 3
drop _merge

* Convert to ppp
generate undergrad_pay =  annual_earnings * ppp_xrat 

sort school years_from_graduate 

collapse (mean) undergrad_pay , by(school years_from_graduate)

replace school = school + " University" if school != "University of Seoul" & school != "Korea Polytechnic University" & school != "Hankuk University of Foreign Studies" & school != "Korea Advanced Institute of Science and Technology"

replace school = "KAIST" if school == "Korea Advanced Institute of Science and Technology"
replace school = "Wonkwang University" if school == "Wongwang University"
replace school = "University of Ulsan" if school == "Ulsan University"
replace school = "Sahmyook University" if school == "Samyook University"
replace school = "Soongsil University" if school == "Soongshil University"
replace school = "Hongik University" if school == "Hongik (Sejong campus) University"
replace school = "Ewha Womans University" if school == "Ehwa Women University"
replace school = "Chung-Ang University" if school == "Chungang (Ansung campus) University"
replace school = "Chung-Ang University" if school == "Chungang (Seoul campus) University"
replace school = "Ajou University" if school == "Aju University"
replace school = "University of Incheon" if school == "Incheon University"
replace school = "Dong-A University" if school == "Donga University"
replace school = "Yonsei University" if school == "Yonsei (Wonju campus) University"
replace school = "Korea University" if school == "Korea (Sejong campus) University"
replace school = "Gachon University" if school == "Gacheon University"
replace school = "Ajou University" if school == "Aju University"

collapse (mean) undergrad_pay , by(school years_from_graduate)

save `tempPath'/`saveCountry'_universities_earnings_merge.dta, replace

********************************************************
* Read in country gdp to get universitycountry gdp
********************************************************

clear 
set more off
set matsize 10000
set scheme s1mono

* Set path and load data 
insheet using "`dataPath'/Salaries_international_dataset_main.csv", comma
drop v1

drop if jobtitle == ""

drop metro shortname 
drop city basecurrency country_iso dateval  
drop gender
drop sectorname iscurrentjobflag 

generate national_rank_pct = national_rank / numberuniversities

*--------------------------------
* Exclude users that leave more than 10 reviews
*--------------------------------

sort fk_userid yearofsalary 

by fk_userid : gen userReviews = _N

drop if userReviews > 10

drop userReviews

*--------------------------------
* Convert with ppp exchange rates
*--------------------------------

generate logbase = ln(basesalary * ppp_xrat)

drop if logbase == .

*--------------------------------
* Exclude outliers in base pay
*--------------------------------

scalar scalingThresh = 10

generate realbase = basesalary * ppp_xrat
generate outside_thresh =  (realbase < (1/scalingThresh) * gdppw) | (realbase > scalingThresh * gdppw) 
drop realbase

drop if outside_thresh

*--------------------------------
* Keep only Australia
*--------------------------------

drop if employertypecode == "SELF_EMPLOYED" 

keep if countryname == "`currentCountry'"

keep if universitycountry == "`currentCountry'"

*--------------------------------
* Generate indicators for valid in education analysis
*--------------------------------

generate hasDegree = degree != "UNMATCHED" & degree != "missing" & degree != ""  & degree != "HIGHSCHOOL"

generate uniDegree = degree == "BACHELORS" 

generate hasSchool = school != ""

generate valid_educ = uniDegree & hasDegree & hasSchool & universitycountry != ""

keep if valid_educ 

********************************************************
* Create valid sampels for one and two year comparisons
********************************************************

*--------------------------------
* Years since school ended
*--------------------------------

split endschool, p("-")

destring endschool1, gen(end_year)

generate years_from_graduate = yearofsalary - end_year

*-------------------------------- 
* Add comparison earnings
*--------------------------------

merge m:1 school years_from_graduate using `tempPath'/`saveCountry'_universities_earnings_merge.dta

keep if _merge == 3

*--------------------------------
* Years since school ended
*--------------------------------

bys school years_from_graduate : egen avg_grad = median(logbase) 

bys school years_from_graduate : gen n_grad = _N

*--------------------------------
* Compare External data and GD data for recent graduates
*--------------------------------

preserve

	collapse (max) avg_grad n_grad undergrad_pay national_rank_pct, by(school years_from_graduate)
	
	generate avg_median_undergrad = ln(undergrad_pay)
		
	* Difference between scorecard and gd 
	generate diff_grad = avg_grad - avg_median_undergrad	
	
	keep if diff_grad != .
	
	rename n_grad n_gd 
	bys school : egen n_grad = sum(n_gd)
	
	collapse (mean) diff_grad (max) n_grad national_rank_pct [aw=n_gd], by(school )
	
	keep if diff_grad != .
	
	* Get averages
	sum diff_grad
	local avg_equal = r(mean)
	scalar est_wtd_equal = r(mean)
	
	sum diff_grad [aw=n_grad]
	local avg_wtd_gd = r(mean)
	scalar est_wtd_gd = r(mean)
	scalar sum_wt_wtd_gd = r(sum_w)

		* Number of universities
		tab school if diff_grad != .
		scalar n_wt_wtd_gd = r(r)

	* Get averages for top 5%
	sum diff_grad if national_rank_pct <= 0.05 [aw=n_grad]
	local avg_wtd_gd_top5 = r(mean)
	scalar est_wtd_gd_top5 = r(mean)
	scalar sum_wt_wtd_gd_top5 = r(sum_w)

		* Number of universities
		tab school if diff_grad != . & national_rank_pct <= 0.05 
		scalar n_wt_wtd_gd_top5 = r(r)
	
	* Get averages for not top 5%
	sum diff_grad if national_rank_pct > 0.05 | national_rank_pct == . [aw=n_grad]
	local avg_wtd_gd_not5 = r(mean)
	scalar est_wtd_gd_not5 = r(mean)
	scalar sum_wt_wtd_gd_not5 = r(sum_w)
	
		* Number of universities
		tab school if diff_grad != . & (national_rank_pct > 0.05 | national_rank_pct == .)
		scalar n_wt_wtd_gd_not5 = r(r)

	* Weighted and unweighted kernel density
	twoway kdensity diff_grad [aw=n_grad], lcolor(gs3) lpattern(dash) xscale(r(-0.75(0.25)0.75)) xlabel(-0.75(0.25)0.75) ///
		xline(0, lcolor(black) lpattern(solid) lwidth(thin)) fcolor(none) ///
		xtitle("log difference within college") ytitle("density") title("") scale(1.2)
	graph export "`figurePath'/Figure_C1o.eps"	, replace

restore

*--------------------------------
* Export summary of selection estimates
*--------------------------------

preserve

	generate selection_est_equal = est_wtd_equal
	generate selection_est_wtd = est_wtd_gd
	generate selection_est_wtd_top5 = est_wtd_gd_top5
	generate selection_est_wtd_not5 = est_wtd_gd_not5

	generate selection_wtd_N = n_wt_wtd_gd
	generate selection_wtd_N_top5 = n_wt_wtd_gd_top5
	generate selection_wtd_N_not5 = n_wt_wtd_gd_not5

	generate selection_wtd_sum = sum_wt_wtd_gd
	generate selection_wtd_sum_top5 = sum_wt_wtd_gd_top5
	generate selection_wtd_sum_not5 = sum_wt_wtd_gd_not5

	keep universitycountry selection_*
	
	keep if _n == 1
	
	order universitycountry selection_wtd_N* selection_wtd_sum* selection_est_* 
	
	outsheet using "`estimatePath'\Selection_`saveCountry'.csv" , comma replace

restore



